title: “Assignment 3” author: “Group1” output: html_document —
Team member:Anusha Gunputh #Observation of missing -most data is missing in YearBuilt and Prices but does not mean data in the other variables are reliable and will show a proper indication on the effect of price which is the response.
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(readr)
library(visdat)
library(naniar)
library(forcats)
library(modelr)
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
##
## bootstrap
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(dplyr)
library(viridis)
## Loading required package: viridisLite
Melbourne_housing_FULL <- read_csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## Suburb = col_character(),
## Address = col_character(),
## Type = col_character(),
## Method = col_character(),
## SellerG = col_character(),
## Date = col_character(),
## Distance = col_double(),
## CouncilArea = col_character(),
## Lattitude = col_double(),
## Longtitude = col_double(),
## Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 12094 BuildingArea no trailing characters .3 '/Users/anne/ETC1010/M… file 2 12096 BuildingArea no trailing characters .33 '/Users/anne/ETC1010/M… row 3 12139 BuildingArea no trailing characters .23 '/Users/anne/ETC1010/M… col 4 12223 BuildingArea no trailing characters .51 '/Users/anne/ETC1010/M… expected 5 12252 BuildingArea no trailing characters .3 '/Users/anne/ETC1010/M…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price, Type, Rooms,YearBuilt) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(Bedroom2>0) %>% mutate(Bedroom2 = as.numeric(Bedroom2, ordered=FALSE))
summary_miss <- miss_summary(houses_select)
summary_miss$miss_df_prop
## [1] 0.127033
vis_miss(houses_select, sort_miss=TRUE) + theme(aspect.ratio=1)
#plotting variables with missing values- ggplot removed rows with missing values as below -bedroom vs price by type
ggplot(houses_select,
aes(x = Bedroom2,
y = Price)) +
geom_point() + theme(aspect.ratio=1)+ facet_wrap(~Type)
## Warning: Removed 5833 rows containing missing values (geom_point).
#Relationship between chosen variables- Bedroom & Rooms -Looks like price tends to increase while rooms increases, they have a linear relationship
ggplot(houses_select,
aes(x = Rooms,
y = Price)) +
scale_colour_brewer(palette="Dark2") +
geom_point(alpha=0.1) +
geom_smooth(method="lm") +
theme(aspect.ratio=1) + facet_wrap(~Type) +
scale_y_log10()
## Warning: Removed 5833 rows containing non-finite values (stat_smooth).
## Warning: Removed 5833 rows containing missing values (geom_point).
ggplot(houses_select,
aes(x = Bedroom2,
y = Price)) +
scale_colour_brewer(palette="Dark2") +
geom_point(alpha=0.1) +
geom_smooth(method="lm") +
theme(aspect.ratio=1) + scale_y_log10() + facet_wrap(~Type)
## Warning: Removed 5833 rows containing non-finite values (stat_smooth).
## Warning: Removed 5833 rows containing missing values (geom_point).
#Impute by type to show the missingness values in Prices -It appears most values of price is missing in the Type h than Type u and Type t. -One way to interpret the data collected is more data you collect for a particular type tend to leave rooms for invariability, hence more missingness.
Type_h <- bind_shadow(houses_select) %>%
filter(Type=="h") %>%
mutate(Bedroom2 = ifelse(is.na(Bedroom2),
mean(Bedroom2, na.rm=TRUE),
Bedroom2),
Price = ifelse(is.na(Price),
mean(Price, na.rm=TRUE),
Price))
Type_u <- bind_shadow(houses_select) %>%
filter(Type=="u") %>%
mutate(Bedroom2 = ifelse(is.na(Bedroom2),
mean(Bedroom2, na.rm=TRUE),
Bedroom2),
Price = ifelse(is.na(Price),
mean(Price, na.rm=TRUE),
Price))
Type_t <- bind_shadow(houses_select) %>%
filter(Type=="t") %>%
mutate(Bedroom2 = ifelse(is.na(Bedroom2),
mean(Bedroom2, na.rm=TRUE),
Bedroom2),
Price = ifelse(is.na(Price),
mean(Price, na.rm=TRUE),
Price))
Type_shadow <- bind_rows(Type_h, Type_u, Type_t)
ggplot(Type_shadow,
aes(x = Bedroom2,
y = Price,
colour=Price_NA)) +
geom_point(alpha=0.7) +
facet_grid(Type~.) +
scale_colour_brewer(palette="Dark2") +
theme(aspect.ratio=0.5)
#Dependencies on other variables- YearBuilt -The plot shows year starting late 1800’s data was collected consistently, no missing data for Bedroom2, however, data is missing for prices. -Particularly; the price missing for bedroom=30 and for a house build in 2106 and has 4 bedroom. lastly house build in 1196 and has 2 bedroom. It is reasonable to take it out based on the fact that we cannot compare it with modern houses since we do not know the state of the house and how well maintained it was. Old houses tend to cost millions of dollars in melbourne. It is important to get rid of incorrect data.. Outliers like this will make our model biased.
houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price, Type, Rooms,YearBuilt) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(Bedroom2>0) %>% mutate(Bedroom2 = as.numeric(Bedroom2, ordered=FALSE))
house_shadow1 <- bind_shadow(Melbourne_housing_FULL)
ggplot(data = house_shadow1,
aes(x = Rooms, y=YearBuilt, colour=Price_NA)) +
scale_colour_brewer(palette="Dark2") +
geom_point(alpha=0.7) + theme(aspect.ratio=1) + facet_wrap(~Type)
## Warning: Removed 19306 rows containing missing values (geom_point).
ggplot(data = house_shadow1,
aes(x = Bedroom2, y=YearBuilt, colour=Price_NA)) +
scale_colour_brewer(palette="Dark2") +
geom_point(alpha=0.7) + theme(aspect.ratio=1) + facet_wrap(~Type)
## Warning: Removed 19306 rows containing missing values (geom_point).
ggplot(data = house_shadow1,
aes(x = Bedroom2, y=Rooms, colour=Price_NA)) +
scale_colour_brewer(palette="Dark2") +
geom_point(alpha=0.7) + theme(aspect.ratio=1) + facet_wrap(~Type) + ylab("Room") +
xlab("Number of Bedroom")
## Warning: Removed 8217 rows containing missing values (geom_point).
#Boxplot- comparing medians -Looks like the median price is $910,000 for 6 bedroom and the starting price for 1 bedroom is $85,000, max price is $11,200,000 for 4 bedroom.
library(forcats)
houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price, Type, Rooms) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(Bedroom2>0) %>% filter(!is.na(Price)) %>% mutate(Bedroom2 = as.numeric(Bedroom2))
houses_select<-na.omit(houses_select)
houses_select <- rename(houses_select, Bedroom = Bedroom2)
house_box <- houses_select %>%
mutate(Bedroom=factor(Bedroom,
levels=c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10","16", "12")))
ggplot(data=house_box, aes(x=Bedroom, y=Price)) +
geom_boxplot()
summary(houses_select)
## Bedroom Price Type Rooms
## Min. : 1.000 Min. : 85000 h:15720 Min. : 1.000
## 1st Qu.: 2.000 1st Qu.: 660000 t: 1579 1st Qu.: 2.000
## Median : 3.000 Median : 910000 u: 3491 Median : 3.000
## Mean : 3.049 Mean : 1090213 Mean : 3.061
## 3rd Qu.: 4.000 3rd Qu.: 1335000 3rd Qu.: 4.000
## Max. :20.000 Max. :11200000 Max. :16.000
summary(houses_select$Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 85000 660000 910000 1090213 1335000 11200000
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 85000 635000 870000 1050173 1295000 11200000 7610
library(labelled)
Melbourne_housing_FULL <- read_csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## Suburb = col_character(),
## Address = col_character(),
## Type = col_character(),
## Method = col_character(),
## SellerG = col_character(),
## Date = col_character(),
## Distance = col_double(),
## CouncilArea = col_character(),
## Lattitude = col_double(),
## Longtitude = col_double(),
## Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 12094 BuildingArea no trailing characters .3 '/Users/anne/ETC1010/M… file 2 12096 BuildingArea no trailing characters .33 '/Users/anne/ETC1010/M… row 3 12139 BuildingArea no trailing characters .23 '/Users/anne/ETC1010/M… col 4 12223 BuildingArea no trailing characters .51 '/Users/anne/ETC1010/M… expected 5 12252 BuildingArea no trailing characters .3 '/Users/anne/ETC1010/M…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price, Type, Rooms, YearBuilt) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(Bedroom2>0, Bedroom2<10) %>% filter(!is.na(Price)) %>% filter(!is.na(Rooms))%>% filter(YearBuilt>1200)
houses_select <- rename(houses_select, Bedroom = Bedroom2) %>% select(Price, Bedroom, Type, Rooms)
ggplot(houses_select, aes(x=Price, y=Bedroom,
colour=Type)) +
geom_point(alpha=0.3) +
scale_colour_brewer("Bedroom", palette="Dark2") +
facet_wrap(~Type, nrow=3) + scale_x_log10() + geom_smooth(method="lm", se=FALSE) +
theme(legend.position="bottom") +
xlab("Price") +
ylab("Number of Bedroom")
#Fitting a linear model- correlation between Price and Bedroom2
houses_select <- Melbourne_housing_FULL %>% select(Bedroom2, Price,Rooms, Type, YearBuilt) %>% mutate(Type = factor(Type, ordered=FALSE)) %>% filter(!is.na(Bedroom2))%>% filter(Bedroom2>0) %>% mutate(Bedroom2 = as.numeric(Bedroom2, ordered=FALSE)) %>% filter(!is.na(Price)) %>% mutate(Price_log = log10(Price)) %>% mutate(Price = as.numeric(Price),ordered=FALSE) %>% filter(YearBuilt>1200)
houses_select <- rename(houses_select, Bedroom = Bedroom2)
houses_fit1 <- lm(Price_log~Bedroom + Rooms, data=houses_select)
houses_fit2 <- lm(Price_log~Bedroom*Rooms, data=houses_select)
tidy(houses_fit1)
## term estimate std.error statistic p.value
## 1 (Intercept) 5.57329360 0.005927029 940.318269 0.000000e+00
## 2 Bedroom 0.01107983 0.007810634 1.418557 1.560539e-01
## 3 Rooms 0.11746504 0.007793065 15.073021 7.068066e-51
##Analysis: without interaction
#Bedroom:Given there is one additional bedroom in any type of house, the price increases by 0.01347383
#Rooms: Given there is one additional roomns in any type of house, the price increases by 0.10509578
tidy(houses_fit2)
## term estimate std.error statistic p.value
## 1 (Intercept) 5.41409489 0.011625162 465.722103 0.000000e+00
## 2 Bedroom 0.06850869 0.008536603 8.025287 1.106307e-15
## 3 Rooms 0.16886411 0.008366419 20.183559 4.000222e-89
## 4 Bedroom:Rooms -0.01690853 0.001065966 -15.862160 4.264035e-56
confint_tidy(houses_fit2)
## conf.low conf.high
## 1 5.39130771 5.43688207
## 2 0.05177558 0.08524180
## 3 0.15246459 0.18526363
## 4 -0.01899799 -0.01481906
##Analysis: with interaction
#Bedroom*Rooms:Given there is one additional bedroom and one additional rooms in any type of house, the price decreases by 0.01288828
#Conclusion: the more rooms we add to the type of house the price starts to decrease
glance(houses_fit1)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.2881387 0.2880208 0.1955431 2443.787 0 3 2574.546
## AIC BIC deviance df.residual
## 1 -5141.092 -5111.495 461.7132 12075
glance(houses_fit2)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.3026702 0.302497 0.193545 1746.873 0 4 2699.098 -5388.197
## BIC deviance df.residual
## 1 -5351.201 452.288 12074
#houses_fit2 has an interaction and is a better fit since r squared is higher than house_fit1
-The models are clearly different. The response is plotted against the two variable on two different level. And the interaction term makes it a much better fit as they interlaps each other while the non interaction term has paralell lines. Model 2 is definitely a better fit. -Model 1: Adding an additional bedroom to the type of house makes the price go up by a coefficient of 0.01107983 and adding an additional room will make it go up by 0.11746504. -Model 2: However, taking interaction into consideration, a house with one bedroom but adding an additional room will the price go down by a coefficient of -0.01690853 which mean even if a house has a lot of bedrooms, the fact that it has more rooms than bedroom or an equal number of rooms will make the price go down as compared to the mean price of any type.
house_mod1 <- augment(houses_fit1, houses_select)
house_mod2 <- augment(houses_fit2, houses_select)
p1 <- ggplot(house_mod1, aes(x=Bedroom, y=.fitted, colour=Rooms, group=Rooms)) +
geom_line() + scale_colour_viridis() +
theme(legend.position="none") + ggtitle("Model 1: no interaction")
p2 <- ggplot(house_mod2, aes(x=Bedroom, y=.fitted, colour=Rooms, group=Rooms)) +
geom_line() + scale_colour_viridis() +
theme(legend.position="none") + ggtitle("Model 2: with an interaction")
p3 <- ggplot(house_mod1, aes(x=Rooms, y=.fitted, colour=Bedroom, group=Bedroom)) +
geom_line() + scale_colour_viridis() +
theme(legend.position="none") + ggtitle("Model 1: no interaction")
p4 <- ggplot(house_mod2, aes(x=Rooms, y=.fitted, colour=Bedroom, group=Bedroom)) +
geom_line() + scale_colour_viridis() +
theme(legend.position="none") + ggtitle("Model 2: with an interaction")
grid.arrange(p1, p2, p3, p4, ncol=2)
#Residuals vs Fitted
p1 <- ggplot(house_mod2, aes(x=.fitted, y=.std.resid)) +
geom_point() + ggtitle("Model 1")
p2 <- ggplot(house_mod2, aes(x=.fitted, y=.std.resid)) +
geom_point() + ggtitle("Model 2")
grid.arrange(p1, p2, ncol=2)
#The residuals and fitted is the same since we took out the missing values.
room<-read_csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## Suburb = col_character(),
## Address = col_character(),
## Type = col_character(),
## Method = col_character(),
## SellerG = col_character(),
## Date = col_character(),
## Distance = col_double(),
## CouncilArea = col_character(),
## Lattitude = col_double(),
## Longtitude = col_double(),
## Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 12094 BuildingArea no trailing characters .3 '/Users/anne/ETC1010/M… file 2 12096 BuildingArea no trailing characters .33 '/Users/anne/ETC1010/M… row 3 12139 BuildingArea no trailing characters .23 '/Users/anne/ETC1010/M… col 4 12223 BuildingArea no trailing characters .51 '/Users/anne/ETC1010/M… expected 5 12252 BuildingArea no trailing characters .3 '/Users/anne/ETC1010/M…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
room<-select(room,Price,Rooms,Type)
Type_h<-bind_shadow(room)%>%
filter(Type=='h')%>%
mutate(Rooms=ifelse(is.na(Rooms),mean(Rooms,na.rm=TRUE),Rooms),Price=ifelse(is.na(Price),mean(Price,na.rm=TRUE),Price))
Type_u<-bind_shadow(room)%>%
filter(Type=='u')%>%
mutate(Rooms=ifelse(is.na(Rooms),mean(Rooms,na.rm=TRUE),Rooms),Price=ifelse(is.na(Price),mean(Price,na.rm=TRUE),Price))
Type_t<-bind_shadow(room)%>%
filter(Type=='t')%>%
mutate(Rooms=ifelse(is.na(Rooms),mean(Rooms,na.rm=TRUE),Rooms),Price=ifelse(is.na(Price),mean(Price,na.rm=TRUE),Price))
Type_shadow<-bind_rows(Type_h,Type_u,Type_t)
#missing values relationship with other variables- Rooms & Type
ggplot(Type_shadow,aes(x=Rooms,y=Price,colour=Price_NA))+geom_point(alpha=0.7)+facet_grid(Type~.)+scale_colour_brewer(palette='Dark2')+
theme(aspect.ratio = 0.5)
#Delete the missing value and mutate the variables as numeric
library(labelled)
room <- room %>%
filter(!is.na(Rooms)) %>%
filter(!is.na(Price)) %>%
select(Price,Rooms, Type) %>%
mutate(Rooms = as.numeric(Rooms)) %>%
mutate(Price = as.numeric(Price)) %>%
filter(Price>0)
library(ggplot2)
ggplot(room, aes(x = Rooms, y =log10(Price ))) + geom_point(alpha=0.1)+
geom_smooth(method="lm") + theme(aspect.ratio=1)+ facet_wrap(~ Type, nrow = 2)
#build two models to identify house types and rooms which may influence the house price
room <- room %>%
mutate(log_price = log10(Price)) %>%
mutate(type = factor(Type, ordered=FALSE))
mod1 <- lm(log_price~Type+Rooms, data=room)
mod2 <- lm(log_price~Type*Rooms, data=room)
tidy(mod1)
## term estimate std.error statistic p.value
## 1 (Intercept) 5.72878127 0.004862399 1178.17995 0.000000e+00
## 2 Typet -0.05044409 0.003748501 -13.45713 3.784551e-41
## 3 Typeu -0.14605052 0.003309374 -44.13237 0.000000e+00
## 4 Rooms 0.08953308 0.001406000 63.67928 0.000000e+00
tidy(mod2)
## term estimate std.error statistic p.value
## 1 (Intercept) 5.77808632 0.005418032 1066.454768 0.000000e+00
## 2 Typet -0.18962632 0.015762260 -12.030402 2.986799e-33
## 3 Typeu -0.32464511 0.009514007 -34.122858 6.199943e-250
## 4 Rooms 0.07468772 0.001580268 47.262704 0.000000e+00
## 5 Typet:Rooms 0.04598135 0.005233746 8.785553 1.645564e-18
## 6 Typeu:Rooms 0.07900155 0.004020568 19.649351 2.286745e-85
confint_tidy(mod2)
## conf.low conf.high
## 1 5.76746670 5.78870594
## 2 -0.22052116 -0.15873149
## 3 -0.34329305 -0.30599717
## 4 0.07159031 0.07778512
## 5 0.03572294 0.05623976
## 6 0.07112103 0.08688207
room_mod1 <- augment(mod1, room)
room_mod2 <- augment(mod2, room)
ggplot(room, aes(x=Rooms, y=log_price,
colour=Type)) +
geom_point(alpha=0.1) +
scale_colour_brewer("Type", palette="Dark2") +
facet_wrap(~Type, ncol=4) +
theme(legend.position="bottom") +
xlab("Rooms") +
ylab("log_price") +
geom_line(data=room_mod1, aes(y=.fitted)) +
ggtitle("Model 1")
ggplot(room, aes(x=Rooms, y=log_price,
colour=Type)) +
geom_point(alpha=0.1) +
scale_colour_brewer("Type", palette="Dark2") +
facet_wrap(~Type, ncol=4) +
theme(legend.position="bottom") +
xlab("Rooms") +
ylab("log_price") +
geom_line(data=room_mod2, aes(y=.fitted)) +
ggtitle("Model 2")
p1 <- ggplot(room_mod1, aes(x=Rooms, y=.fitted,
colour=Type)) +
geom_line() +
scale_colour_brewer("Type", palette="Dark2") +
theme(legend.position="bottom") +
xlab("Rooms") +
ylab("log_price") + ggtitle("Model 1")
p2 <- ggplot(room_mod2, aes(x=Rooms, y=.fitted,
colour=Type)) +
geom_line() +
scale_colour_brewer("Type", palette="Dark2") +
theme(legend.position="bottom") +
xlab("Rooms") +
ylab("log_price") + ggtitle("Model 2")
grid.arrange(p1, p2, ncol=2)
#compared two models,the mod2 is better than mod1
glance(mod1)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.3268587 0.3267846 0.1842156 4409.481 0 4 7432.523
## AIC BIC deviance df.residual
## 1 -14855.05 -14813.98 924.5021 27243
glance(mod2)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.3372794 0.3371578 0.1827909 2772.761 0 6 7645.074
## AIC BIC deviance df.residual
## 1 -15276.15 -15218.66 910.1901 27241
glance(room_mod1)
## nrow ncol complete.obs na.fraction
## 1 27247 12 27247 0
glance(room_mod2)
## nrow ncol complete.obs na.fraction
## 1 27247 12 27247 0
p1 <- ggplot(room_mod1, aes(x=Type, y=.fitted, colour=Rooms, group=Rooms)) +
geom_line() + scale_colour_viridis() +
theme(legend.position="none") + ggtitle("Model 1: no interaction")
p2 <- ggplot(room_mod2, aes(x=Type, y=.fitted, colour=Rooms, group=Rooms)) +
geom_line() + scale_colour_viridis() +
theme(legend.position="none") + ggtitle("Model 2: with an interaction")
p3 <- ggplot(room_mod1, aes(x=Rooms, y=.fitted, colour=Type, group=Type)) +
geom_line() +
theme(legend.position="none") + ggtitle("Model 1: no interaction")
p4 <- ggplot(room_mod2, aes(x=Rooms, y=.fitted, colour=Type, group=Type)) +
geom_line() +
theme(legend.position="none") + ggtitle("Model 2: with an interaction")
grid.arrange(p1, p2, p3, p4, ncol=2)
# Team member: Xiaoru Chen
library(tidyverse)
library(modelr)
library(labelled)
library(broom)
ass2 <- read.csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
ass3 <- ass2
#choose the collumn of variables
assm <- select(ass3, Price, Distance, Rooms)
#plot the graph of missing values
library(visdat)
vis_dat(assm)
#Delete the missing value of Variables Distance and Rooms and select the collumn of variables
ass4 <- filter(ass3, !is.na(Distance)) %>% filter(!is.na(Rooms))
ass5 <- filter(ass4, !is.na(Price))
ass6 <- select(ass5, Price, Distance, Rooms) %>% mutate(Distance = to_factor(Distance, ordered=TRUE, drop_unused_labels = TRUE))
#Transfrom the variables into numberic type and create a new collumn with log10(Price) so the scale of price can bechanged into reasonable scale
ass7 <- mutate(ass6, Price = as.numeric(Price))
ass8 <- mutate(ass7, Distance = as.numeric(Distance))
ass9 <- mutate(ass7, Rooms = as.numeric(Rooms))
ass10 <- mutate(ass8, Price = log10(Price))
#Use ggplot to draw a graph with fitted line
p1 <- ggplot(ass10, mapping = aes(x=Distance, y=Price))+geom_point()+geom_smooth(method="lm", se=FALSE)
ggplot(ass10, mapping = aes(x=Rooms, y=Price))+geom_point()+geom_smooth(method="lm", se=FALSE)
#Create a new model of using distance and rooms as independent variables and price as dependent variable
pisa_lm <- lm(Price~Distance+Rooms, data=ass10)
#create a model with interaction
pisa_lm1 <- lm(Price~Distance+Rooms-Distance*Rooms, data=ass10)
#Use tidy and glance funciton to get the estimated values of parameters and the r squared
tidy(pisa_lm)
## term estimate std.error statistic p.value
## 1 (Intercept) 5.5313586106 4.312030e-03 1282.77355 0.000000e+00
## 2 Distance 0.0004338575 1.617518e-05 26.82243 1.908242e-156
## 3 Rooms 0.1276028495 1.201559e-03 106.19777 0.000000e+00
glance(pisa_lm)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0.2970719 0.2970203 0.1882439 5756.938 0 3 6842.634
## AIC BIC deviance df.residual
## 1 -13677.27 -13644.42 965.4117 27244
tidy(pisa_lm1)
## term estimate std.error statistic p.value
## 1 (Intercept) 5.959711 0.00136016 4381.624 0
glance(pisa_lm1)
## r.squared adj.r.squared sigma statistic p.value df logLik
## 1 0 0 0.2245171 NA NA 1 2040.341
## AIC BIC deviance df.residual
## 1 -4076.683 -4060.257 1373.415 27246
#From the result we can see the model with interaction has r squared of 0, therefore we choose anther model with no interaction.
library(naniar)
library(visdat)
library(broom)
library(tidyverse)
library(dplyr)
library(tidyr)
library(ggplot2)
Melbourne_housing_FULL <- read_csv("/Users/anne/ETC1010/Melbourne_housing_FULL.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## Suburb = col_character(),
## Address = col_character(),
## Type = col_character(),
## Method = col_character(),
## SellerG = col_character(),
## Date = col_character(),
## Distance = col_double(),
## CouncilArea = col_character(),
## Lattitude = col_double(),
## Longtitude = col_double(),
## Regionname = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 189 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 12094 BuildingArea no trailing characters .3 '/Users/anne/ETC1010/M… file 2 12096 BuildingArea no trailing characters .33 '/Users/anne/ETC1010/M… row 3 12139 BuildingArea no trailing characters .23 '/Users/anne/ETC1010/M… col 4 12223 BuildingArea no trailing characters .51 '/Users/anne/ETC1010/M… expected 5 12252 BuildingArea no trailing characters .3 '/Users/anne/ETC1010/M…
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.
vis_miss(Melbourne_housing_FULL, sort_miss=TRUE) + theme(aspect.ratio=1)
ggplot(Melbourne_housing_FULL,
aes(x = Bathroom,
y = Bedroom2)) +
geom_point() + theme(aspect.ratio=1)
## Warning: Removed 8226 rows containing missing values (geom_point).
#Warning message:
#Removed 3557 rows containing missing values (geom_point).
ggplot(Melbourne_housing_FULL,
aes(x = Bathroom,
y = Bedroom2)) +
scale_colour_brewer(palette="Dark2") +
geom_miss_point() + theme(aspect.ratio=1)
#21.7% missing values, 0.1240107, proportion of observations missing
#0.4285714 proportion of variables missing
#3557 rows contained missing values
#The plot showing missing values shows that the missing values are from the 0 values.
houses_select <- Melbourne_housing_FULL %>%
select(Bathroom, Price, Type, Rooms) %>%
mutate(Type = factor(Type, ordered = FALSE)) %>%
filter(Bathroom>0) %>% mutate(Bathroom = as.numeric(Bathroom, ordered=FALSE)) %>%
mutate(Price = as.numeric(Price)) %>%
filter(Price>0) %>%
filter(is.na(Price))%>% mutate(Price_log = log10(Price))
Type_h <- bind_shadow(Melbourne_housing_FULL) %>%
filter(Type=="h") %>%
mutate(Bathroom = ifelse(is.na(Bathroom),
mean(Bathroom, na.rm = TRUE),
Bathroom),
Price = ifelse(is.na(Price),
mean(Price, na.rm = TRUE),
Price))
Type_u <- bind_shadow(Melbourne_housing_FULL) %>%
filter(Type=="u") %>%
mutate(Bathroom = ifelse(is.na(Bathroom),
mean(Bathroom, na.rm=TRUE),
Bathroom),
Price = ifelse(is.na(Price),
mean(Price, na.rm=TRUE),
Price))
Type_t <- bind_shadow(Melbourne_housing_FULL) %>%
filter(Type=="t") %>%
mutate(Bathroom = ifelse(is.na(Bathroom),
mean(Bathroom, na.rm = TRUE),
Bathroom),
Price = ifelse(is.na(Price),
mean(Price, na.rm = TRUE),
Price))
Type_shadow <- bind_rows(Type_h, Type_u, Type_t)
#Plotted final missingness data, Bathroom and Price
ggplot(Type_shadow,
aes(x = Bathroom,
y = Price,
colour=Price_NA)) +
geom_point(alpha=0.7) +
facet_grid(Type~.) +
scale_colour_brewer(palette = "Dark2") +
theme(aspect.ratio = 0.5)
#Comparing the R square of the 3 models above, based on the R-squares 0.3372794 (log_price~Type*Rooms)which is higher than 0.2970719 (log10(price) vs distance*rooms) and 0.3026702 (log10(price) vs bedroom*rooms) , we can conclude that the second model which is (log_price~Type*Rooms) is the best model.
#We can be 95% confident that the mean increase in price is between 0.03572294 and 0.05623976 when the type is t and has 1 additional room compared to the type h and 95% confident of an increase in price between 0.07112103 and 0.08688207 when the type is u and has an additional rooms compared to type h.